Intro

Dados

# Read the full dataset and build a stratified 80/20 train/test split.
full <- readr::read_csv("train.csv")

# Stratify by `target` so both partitions keep the class balance.
set.seed(314)
test <- full %>%
  group_by(target) %>%
  sample_frac(size = 0.2) %>%
  ungroup()

# Training set = every row of `full` whose id was not drawn into `test`.
train <- anti_join(full, test, by = "id")

Redução de dimensionalidade

Técnicas de redução de dimensionalidade serão aplicadas a fim de entender, de maneira visual, como os dados estão distribuídos.

# Work on a 10% stratified subsample so the projections below stay fast.
set.seed(314)
to_pca <- train %>%
  group_by(target) %>%
  sample_frac(size = 0.1) %>%
  ungroup()

Correlação

# Spearman correlation between all features, shown as a clustered heatmap.
feature_cors <- to_pca %>%
  select(-id, -target) %>%
  cor(method = 'spearman')

heatmaply::heatmaply_cor(
  feature_cors,
  xlab = "Features",
  ylab = "Features",
  k_col = 4,
  k_row = 8
)

A correlação é inexistente entre as features deste dataset, o que inviabiliza o uso de alguns recursos de redução de dimensionalidade.

PCA

# Principal component analysis on the standardized features.
# `scale.unit = TRUE` standardizes each variable before the decomposition.
# (TRUE spelled out: `T` is an ordinary variable and can be reassigned.)
res.pca <-
  to_pca %>%
  select(-id, -target) %>%
  FactoMineR::PCA(graph = FALSE, scale.unit = TRUE)

# get_eigenvalue(res.pca)
# Scree plot: percentage of variance explained by the first 50 components.
factoextra::fviz_eig(res.pca, addlabels = TRUE, ncp = 50)

t-SNE - t-Distributed Stochastic Neighbor Embedding

Este algoritmo é uma ótima ferramenta para ajudar na compreensão de dados de alta dimensionalidade, porém não é tão útil no treinamento de modelos de machine learning que exigem redução de dimensionalidade.

# 2-D t-SNE embedding of the subsample (PCA pre-reduction disabled).
# `TRUE` spelled out instead of the reassignable shorthand `T`.
tsne_tps <- to_pca %>%
  select(-id, -target) %>%
  Rtsne::Rtsne(
    dims = 2, perplexity = 30,
    PCA = FALSE,
    verbose = TRUE, max_iter = 500,
    check_duplicates = FALSE
  )
# Cache the (slow) embedding so downstream chunks can re-run without refitting.
saveRDS(tsne_tps, "tsne_tps.rds")
tsne_tps <- readRDS("tsne_tps.rds")

# Scatter the embedding (columns V1/V2 of $Y), colored by class label.
as_tibble(tsne_tps$Y) %>%
  bind_cols(select(to_pca, target)) %>%
  ggplot(aes(x = V1, y = V2, col = target)) +
  geom_point() +
  labs(title = "t-SNE")

UMAP - Uniform Manifold Approximation and Projection

# BUG FIX: the original embedded `to_tsne`, an object that is never defined,
# and colored the plot with `train$target`, whose length does not match the
# number of embedded rows. Both now use `to_pca`, the same stratified
# subsample the t-SNE section embeds.
umap_tps <- umap::umap(select(to_pca, -id, -target))
# Cache the embedding like the t-SNE result above.
saveRDS(umap_tps, "umap_tps.rds")
umap_tps <- readRDS("umap_tps.rds")

# predict(umap_tps, select(test, -id, -target))
as_tibble(umap_tps$layout) %>%
  bind_cols(select(to_pca, target)) %>%
  ggplot(aes(x = V1, y = V2, col = target)) +
  geom_point() +
  labs(title = "UMAP")

Redes Neurais

# Shared training hyper-parameters for the neural-network experiments below.
# `epochs` is an upper bound; early stopping usually halts training sooner.
batch_size <- 128
epochs <- 500

AE - AutoEncoder

# Standardize every feature (zero mean, unit variance) and convert to the
# numeric matrix keras expects.
# `mutate_all()` is superseded; `across()` is the supported idiom, and
# `as.numeric()` drops the 1-column-matrix wrapper that `scale()` returns.
x_train <-
  full %>%
  select(-id, -target) %>%
  mutate(across(everything(), ~ as.numeric(scale(.x)))) %>%
  as.matrix()

# Symmetric autoencoder: input -> 564 -> 256 -> 2 -> 256 -> 564 -> input.
# The 2-unit "bottleneck" layer is the learned 2-D representation.
# keras layers mutate the model in place, so the pipe can be split freely.
model <- keras_model_sequential()

# Encoder: compress the input down to two dimensions.
model %>%
  layer_dense(units = 564, activation = "relu", input_shape = ncol(x_train)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 2, activation = "relu", name = "bottleneck")

# Decoder: mirror the encoder back up to the original feature width.
model %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 564, activation = "relu") %>%
  layer_dense(units = ncol(x_train), activation = "linear")

# Architecture overview; the captured output below shows ~348k parameters.
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_4 (Dense)                     (None, 564)                     28764       
## ________________________________________________________________________________
## dense_3 (Dense)                     (None, 256)                     144640      
## ________________________________________________________________________________
## bottleneck (Dense)                  (None, 2)                       514         
## ________________________________________________________________________________
## dense_2 (Dense)                     (None, 256)                     768         
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 564)                     144948      
## ________________________________________________________________________________
## dense (Dense)                       (None, 50)                      28250       
## ================================================================================
## Total params: 347,884
## Trainable params: 347,884
## Non-trainable params: 0
## ________________________________________________________________________________
# Compile for reconstruction: mean-squared-error loss with Adam.
# `lr` is deprecated in recent keras releases; `learning_rate` is the
# supported argument name.
# NOTE(review): "accuracy" is not a meaningful metric for a regression
# objective; it is kept only for continuity with the history plots below.
model %>% compile(
  metrics = c("accuracy"),
  loss = "mean_squared_error",
  optimizer = optimizer_adam(learning_rate = 0.001)
)
# Stop after 50 epochs without a >= 0.01 improvement in validation loss,
# rolling back to the best weights observed.
early_stop <- callback_early_stopping(
  monitor = "val_loss",
  min_delta = 0.01,
  patience = 50,
  restore_best_weights = TRUE
)

# Autoencoder training: the input is its own target. Time the run.
tictoc::tic()
history <- model %>%
  fit(
    x = x_train, y = x_train,
    epochs = epochs,
    validation_split = .2,
    view_metrics = TRUE,
    callbacks = list(early_stop),
    verbose = 2
  )
tictoc::toc()

# Persist the trained model and its history, then immediately reload them:
# a notebook caching pattern so later chunks can re-run without retraining.
model %>% save_model_tf("model_ae")
saveRDS(history, "history_ae.rds")
model <- load_model_tf("model_ae")
history <- readRDS("history_ae.rds")
history
## 
## Final epoch (plot to see history):
##         loss: 0.656
##     accuracy: 0.4906
##     val_loss: 0.6757
## val_accuracy: 0.4726
# gap = epochs - nrow(as_metrics_df(history))
# gap_tbl = tibble(loss= rep(NA_real_, gap),
#                  accuracy= rep(NA_real_, gap),
#                  val_loss= rep(NA_real_, gap),
#                  val_accuracy= rep(NA_real_, gap)
# )

# bind_rows(as_metrics_df(history), gap_tbl ) %>% 
# Training curves in long format, faceted into accuracy vs. loss panels.
# `gather()` is superseded; `pivot_longer()` is its replacement, and
# `row_number()` replaces the `1:nrow(.)` anti-pattern.
as_metrics_df(history) %>%
  mutate(epochs = row_number()) %>%
  pivot_longer(-epochs, names_to = "key", values_to = "val") %>%
  mutate(metric = case_when(
    str_detect(key, "accuracy") ~ "accuracy",
    str_detect(key, "loss") ~ "log_loss" )) %>%
  ggplot(aes(x = epochs, y = val, col = key)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bw() +
  facet_wrap(~metric, scales = "free_y")

# evaluate the performance of the model
# Reconstruction performance of the trained autoencoder on the full matrix.
mse.ae2 <- evaluate(model, x_train, x_train)
mse.ae2
##      loss  accuracy 
## 0.6329905 0.5052900
# Expose the 2-unit bottleneck as its own model and project every row
# through it to obtain the learned 2-D codes.
intermediate_layer_model <- keras_model(
  inputs = model$input,
  outputs = get_layer(model, "bottleneck")$output
)
intermediate_output <- predict(intermediate_layer_model, x_train)

# Scatter the 2-D codes, colored by the class label.
bottleneck_df <- data.frame(
  PC1 = intermediate_output[, 1],
  PC2 = intermediate_output[, 2]
)
ggplot(bottleneck_df, aes(x = PC1, y = PC2, col = full$target)) +
  geom_point(alpha = .5)

# train_dae <- predict(model, x_train)
# train_dae %>% as_tibble()

DAE - Denoise AutoEncoder

# Random 80/20 row assignment: 1 = training, 2 = validation.
# `TRUE` spelled out instead of the reassignable shorthand `T`.
# NOTE(review): unlike the earlier split, this one is not stratified by
# `target` — confirm that is intentional.
samples <- sample(1:2, size = nrow(full), replace = TRUE, prob = c(0.8, 0.2))

# Standardized feature matrices for the denoising autoencoder.
# `mutate_all()` is superseded; `across()` is the supported idiom, and
# `as.numeric()` drops the 1-column-matrix wrapper that `scale()` returns.
x_train <-
  full %>%
  filter(samples == 1) %>%
  select(-id, -target) %>%
  mutate(across(everything(), ~ as.numeric(scale(.x)))) %>%
  as.matrix()

x_val <-
  full %>%
  filter(samples == 2) %>%
  select(-id, -target) %>%
  mutate(across(everything(), ~ as.numeric(scale(.x)))) %>%
  as.matrix()
# Same symmetric architecture as the plain autoencoder:
# input -> 564 -> 256 -> 2 ("bottleneck") -> 256 -> 564 -> input.
# keras layers mutate the model in place, so the pipe can be split freely.
model <- keras_model_sequential()

# Encoder half.
model %>%
  layer_dense(units = 564, activation = "relu", input_shape = ncol(x_train)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 2, activation = "relu", name = "bottleneck")

# Decoder half, mirroring the encoder.
model %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 564, activation = "relu") %>%
  layer_dense(units = ncol(x_train), activation = "linear")

# Architecture overview; identical parameter count to the plain autoencoder.
summary(model)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_9 (Dense)                     (None, 564)                     28764       
## ________________________________________________________________________________
## dense_8 (Dense)                     (None, 256)                     144640      
## ________________________________________________________________________________
## bottleneck (Dense)                  (None, 2)                       514         
## ________________________________________________________________________________
## dense_7 (Dense)                     (None, 256)                     768         
## ________________________________________________________________________________
## dense_6 (Dense)                     (None, 564)                     144948      
## ________________________________________________________________________________
## dense_5 (Dense)                     (None, 50)                      28250       
## ================================================================================
## Total params: 347,884
## Trainable params: 347,884
## Non-trainable params: 0
## ________________________________________________________________________________
# Compile for reconstruction: mean-squared-error loss with Adam.
# `lr` is deprecated in recent keras releases; `learning_rate` is the
# supported argument name.
# NOTE(review): "accuracy" is not a meaningful metric for a regression
# objective; it is kept only for continuity with the history plots below.
model %>% compile(
  metrics = c("accuracy"),
  loss = "mean_squared_error",
  optimizer = optimizer_adam(learning_rate = 0.001)
)
#' Build a keras-style batch generator that injects "swap noise".
#'
#' Each call of the returned function draws `batch_size` random rows and,
#' independently per column, permutes the values of roughly `perc` of those
#' rows. It yields `list(corrupted_batch, clean_batch)` — the training pair
#' for a denoising autoencoder.
#'
#' Fixes vs. the original:
#'  * `sample(x)` on a length-1 numeric is interpreted as `sample(1:x)`,
#'    silently injecting wrong values when exactly one row was selected;
#'    we now permute indices (`sample.int`) instead of values.
#'  * the original tryCatch handler assigned `data = data` locally, which
#'    could not undo a partially shuffled batch; we now fall back to the
#'    clean batch and raise a warning.
#'  * `T`/`F` replaced with `TRUE`/`FALSE`; `1:nrow(...)` with `seq_len()`.
batch_generator <- function(data, batch_size, perc = .35)
{
  function() {

    # Sample the rows for this batch and keep an untouched copy as target.
    ids <- sample(seq_len(nrow(data)), batch_size, replace = FALSE)
    clean <- as.matrix(data[ids, , drop = FALSE])

    noisy <- tryCatch({
      batch <- clean
      for (j in seq_len(ncol(batch))) {
        # Mark ~perc of the batch rows for shuffling in this column.
        pick <- which(sample(
          c(TRUE, FALSE), size = nrow(batch), replace = TRUE,
          prob = c(perc, 1 - perc)
        ))
        # Permute the selected entries among themselves. Indexing by a
        # permutation of `pick` avoids sample()'s length-1 surprise.
        batch[pick, j] <- batch[pick[sample.int(length(pick))], j]
      }
      batch
    }, error = function(e) {
      # Best effort: if shuffling fails, train on the clean batch instead.
      warning("swap-noise shuffle failed, using clean batch: ",
              conditionMessage(e), call. = FALSE)
      clean
    })

    list(noisy, clean)
  }
}

# With generators, an "epoch" is a fixed number of batch draws rather than
# one pass over the data.
steps_per_epoch <- 600 #floor(nrow(x_train)/batch_size)
validation_steps <- 100 #floor(nrow(x_val)/batch_size)

# Stop after 50 epochs without a >= 0.01 improvement in validation loss,
# rolling back to the best weights observed.
early_stop_dae <- callback_early_stopping(
  monitor = "val_loss",
  min_delta = 0.01,
  patience = 50,
  restore_best_weights = TRUE
)

# Train on (noisy, clean) pairs produced by the swap-noise generator.
tictoc::tic()
history <- model %>%
  fit(
    batch_generator(x_train, batch_size),
    steps_per_epoch = steps_per_epoch,
    validation_steps = validation_steps,
    validation_data = batch_generator(x_val, batch_size),
    epochs = epochs,
    view_metrics = TRUE,
    callbacks = list(early_stop_dae),
    verbose = 2
  )
tictoc::toc()

# BUG FIX: saveRDS() cannot persist a keras model — it serializes only the R
# external pointer, which is dead on reload. Use save_model_tf(), matching
# how the plain autoencoder was saved earlier in this file.
model %>% save_model_tf("model_dae")
saveRDS(history, "history_dae.rds")
history <- readRDS("history_dae.rds")
history
## 
## Final epoch (plot to see history):
##         loss: 0.7782
##     accuracy: 0.3716
##     val_loss: 0.7931
## val_accuracy: 0.3698
# Pad the recorded metrics with NA rows up to `epochs`, so the x-axis spans
# the full training budget even when early stopping halted the run.
# `<-` replaces `=` for assignment per R convention.
gap <- epochs - nrow(as_metrics_df(history))
gap_tbl <- tibble(
  loss = rep(NA_real_, gap),
  accuracy = rep(NA_real_, gap),
  val_loss = rep(NA_real_, gap),
  val_accuracy = rep(NA_real_, gap)
)

# Training curves in long format, faceted into accuracy vs. loss panels.
# `gather()` is superseded; `pivot_longer()` is its replacement.
bind_rows(as_metrics_df(history), gap_tbl) %>%
  mutate(epochs = row_number()) %>%
  pivot_longer(-epochs, names_to = "key", values_to = "val") %>%
  mutate(metric = case_when(
    str_detect(key, "accuracy") ~ "accuracy",
    str_detect(key, "loss") ~ "log_loss" )) %>%
  ggplot(aes(x = epochs, y = val, col = key)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bw() +
  facet_wrap(~metric, scales = "free_y")